1 Setup

library(tidyverse)
library(here)
library(readr)
library(readxl)
library(data.table)

1.1 Loading in cleaned data

candy <- read_csv(here("clean_data/candy_clean.csv"),
# specifying data types
col_types = cols(gender = col_character(), country = col_character()))

2 Data

2.1 Raw

# 1/3 for illustration used
"raw_data/boing-boing-candy-2015.xlsx" %>%
    here() %>%
    read_excel() %>% 
    data.table()

2.2 Clean data

candy %>% 
    data.table()

3 Questions

3.1 What is the total number of candy ratings given across the three years?

candy %>%
# filtering out NAs
  filter(!is.na(rating)) %>%
  nrow()
## [1] 757108

3.2 What was the average age of people who are going out trick or treating and the average age of people not going trick or treating?

candy %>%
  group_by(going_out) %>%
# Rounding to whole years
  summarise(avg_age = round(mean(age, na.rm = TRUE))) %>%
# Yes first
    arrange(desc(going_out))
## `summarise()` ungrouping output (override with `.groups` argument)

3.3 For each of joy, despair and meh, which candy bar recieved these ratings the most?

candy %>%
# Filtering out NAs
  filter(!is.na(rating)) %>%
  group_by(rating, candy) %>%
  summarise(
    count = n()
  ) %>%
  filter(count == max(count))
## `summarise()` regrouping output by 'rating' (override with `.groups` argument)

3.4 How many people rated Starburst as despair?

candy %>%
  filter(candy == "starburst") %>%
  filter(rating == "despair") %>%
  summarise(id_count = n_distinct(person_id))

For the next two questions, count despair as -1, joy as +1 and meh as 0.

candy <-
  candy %>%
  mutate(rating_numeric = recode(rating,
                                 joy = 1,
                                 despair = -1,
                                 meh = 0))